import numpy as np
import pandas as pd
# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Visualisation libraries
## Progress Bar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we work on a dataset available from the UCI Machine Learning Repository. The data is related to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe to a term deposit (variable y).
This dataset is based on the Bank Marketing dataset from the UC Irvine Machine Learning Repository. The data is enriched by the addition of five new social and economic features/attributes (nation-wide indicators from a country with a population of roughly 10 million), published by the Banco de Portugal and publicly available at bportugal.pt/estatisticasweb. This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns).
The data is related to the direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to assess whether the product (bank term deposit) would be ('yes') or not ('no') subscribed.
# Load the standardized (z-scored) Bank Marketing dataset.
Data = pd.read_csv('Data/bank-additional-full_STD.csv')
# Name of the binary target column and its human-readable class labels
# (0 -> 'No', 1 -> 'Yes'), used throughout the notebook.
Target = 'Term Deposit Subscription'
Labels = ['No', 'Yes']
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a colored banner followed by a '=' rule padded to total width L.

    C selects the banner background color, T the banner text color (colorama names).
    """
    backgrounds = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                   'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                   'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    foregrounds = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                   'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                   'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = backgrounds[C] + foregrounds[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = foregrounds[C] + Style.NORMAL + '=' * (L - len(Text) - 1) + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal '=' rule of length L in colorama color C."""
    palette = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
               'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
               'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(''.join((palette[C], Style.NORMAL, '=' * L, Style.RESET_ALL)))
def Search_List(Key, List):
    """Return (as a list) the elements of List that contain the substring Key."""
    return list(filter(lambda item: Key in item, List))
# Preview the first ten rows of the standardized dataset.
# NOTE(review): Styler.hide_index()/set_precision() are deprecated in newer
# pandas (replaced by hide()/format(precision=...)) — confirm the pandas version.
Header('Standardized Dataset')
display(Data.head(10).style.hide_index().set_precision(2))
Standardized Dataset ===============================================================================
| Age | Job | Marital | Education | Default | Housing | Loan | Contact | Month | Day Of Week | Duration | Campaign | Pdays | Previous | Poutcome | Employment Variation Rate | Consumer Price Index | Consumer Confidence Index | Euribor three Month Rate | Number of Employees | Term Deposit Subscription |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.53 | -1.65 | 0.28 | -1.54 | 0.51 | -0.92 | -0.32 | -1.32 | -0.79 | -1.40 | 0.01 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| 1.63 | -0.00 | 0.28 | 0.03 | -1.95 | -0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.42 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| -0.29 | -0.00 | 0.28 | 0.03 | 0.51 | 0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.12 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| -0.00 | 0.82 | 0.28 | -1.02 | 0.51 | -0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.41 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| 1.53 | -0.00 | 0.28 | 0.03 | 0.51 | -0.92 | 2.18 | -1.32 | -0.79 | -1.40 | 0.19 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| 0.48 | -0.00 | 0.28 | -0.49 | -1.95 | -0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.23 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| 1.82 | 0.82 | 0.28 | 0.55 | 0.51 | -0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.46 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| 0.09 | -0.82 | 0.28 | -2.59 | -1.95 | -0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.16 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| -1.54 | 0.41 | -1.36 | 0.55 | 0.51 | 0.92 | -0.32 | -1.32 | -0.79 | -1.40 | 0.47 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
| -1.44 | -0.00 | -1.36 | 0.03 | 0.51 | 0.92 | -0.32 | -1.32 | -0.79 | -1.40 | -0.80 | -0.57 | -0.17 | -0.35 | -0.37 | 0.65 | 0.72 | 0.89 | 0.71 | 0.33 | 0 |
First, consider the data distribution for Term Deposit Subscription.
def Dist_Table(Inp, Target = Target):
    """Return a DataFrame of class counts and percentages for Inp[Target].

    Columns: [Target, 'Count', 'Percentage']; the numeric class codes 0/1 are
    mapped to the human-readable module-level Labels.
    """
    counts = Inp[Target].value_counts()
    Table = counts.rename_axis(Target).reset_index(name='Count')
    Table[Target] = Table[Target].replace(dict(zip([0, 1], Labels)))
    Table['Percentage'] = np.round(100 * Table['Count'] / Table['Count'].sum(), 2)
    return Table
def Dist_Plot(Table, PieColors = ['FireBrick', 'SeaGreen'], TableColors = ['Navy','White']):
    """Show the class-distribution Table (left) next to a donut chart (right).

    Table       : output of Dist_Table (columns: Target, 'Count', 'Percentage').
    PieColors   : slice colors for the donut chart.
    TableColors : [header fill, cell fill] colors for the table.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right: donut chart of class counts, second (minority) slice pulled out.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values, pull=[0, 0.1],
                         textfont=dict(size=16),
                         marker=dict(colors = PieColors, line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=.5)
    fig.update_layout(height = 400, legend=dict(orientation="v"), legend_title_text= Target)
    # Left: table of counts/percentages; percentages rendered with two decimals.
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = [0.4, 0.2, 0.2],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    # BUG FIX: the title previously read '<b>' + Target + 'Distribution' + '<b>' —
    # missing space before 'Distribution' and an unclosed bold tag.
    fig.update_layout(title={'text': '<b>' + Target + ' Distribution' + '</b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Build and plot the class-distribution table for the target column.
Table = Dist_Table(Data)
Dist_Plot(Table)
StratifiedShuffleSplit returns randomized, stratified train/test splits: each split contains approximately the same percentage of samples of each target class as the complete set. (Here a single split is used, with 30% of the data held out for testing.)
# Optional feature pruning (disabled): macro-economic indicators considered for removal.
# Data = Data.drop(columns = ['Euribor three Month Rate', 'Number of Employees',
# 'Employment Variation Rate', 'Consumer Price Index'])
X = Data.drop(columns = Target).values   # feature matrix (all columns but the target)
y = Data[Target].astype(float).values    # binary target as float for Keras
Test_Size = 0.3
# One stratified, shuffled split: preserves the class ratio in train and test sets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)  # returns n_splits only; effectively a no-op kept from the sklearn example
for train_index, test_index in sss.split(X, y):  # single iteration since n_splits=1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Plot(y_train = y_train, y_test = y_test, Colors = ['FireBrick', 'SeaGreen']):
    """Donut charts of the class balance in the train (left) and test (right) sets.

    Defaults bind the module-level y_train/y_test at definition time.
    np.unique returns labels in sorted order (0., 1.), matching Labels ['No', 'Yes'].
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}]*2])
    # Train-set class counts.
    _, Temp = np.unique(y_train, return_counts=True)
    fig.add_trace(go.Pie(labels=Labels,
                         values= Temp,
                         pull=[0, 0.1],
                         name= 'Train Set',
                         textfont=dict(size=16),
                         marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 1)
    # Test-set class counts.
    _, Temp = np.unique(y_test, return_counts=True)
    fig.add_trace(go.Pie(labels=Labels,
                         values=Temp,
                         pull=[0, 0.1],
                         name= 'Test Set',
                         textfont=dict(size=16),
                         marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 2)
    fig.update_traces(hole=.5)
    # BUG FIX: bold tags were opened twice ('<b>...<b>'); now closed with '</b>'.
    fig.update_layout(height = 400, legend=dict(orientation="v"),
                      legend_title_text= Target,
                      annotations=[dict(text= '<b>' + 'Train<br>Set' + '</b>', x=0.195, y=0.5, font_size=14, showarrow=False),
                                   dict(text= '<b>' + 'Test<br>Set' + '</b>', x=0.8, y=0.5, font_size=14, showarrow=False)],
                      title={'text': '<b>' + Target + '</b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()

# Visualize the stratified train/test class balance.
Train_Test_Plot()
Therefore, we have divided the dataset into train and test set using stratification that preserves the distribution of classes in train and test sets.
A multi-layer perceptron (MLP) is a class of feedforward artificial neural network (ANN). At each iteration, the algorithm uses the cross-entropy loss to measure the error, then computes the gradient and updates the model. At the end of this iterative process, we reach a better level of agreement between the true and predicted labels, since the error is lower than it was at the first step.
# Binary-classification MLP: two 64-unit ReLU hidden layers, each followed by
# 50% dropout for regularization, and a single sigmoid output unit.
model = keras.Sequential(name = 'Binary_MLP')
model.add(layers.Dense(64, input_dim = X.shape[1], activation='relu', name='Layer1'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu', name='Layer2'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid', name='Layer3'))
model.summary()
# Draw the architecture left-to-right (requires pydot + graphviz to be installed).
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True, rankdir = 'LR')
Model: "Binary_MLP" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= Layer1 (Dense) (None, 64) 1344 _________________________________________________________________ dropout (Dropout) (None, 64) 0 _________________________________________________________________ Layer2 (Dense) (None, 64) 4160 _________________________________________________________________ dropout_1 (Dropout) (None, 64) 0 _________________________________________________________________ Layer3 (Dense) (None, 1) 65 ================================================================= Total params: 5,569 Trainable params: 5,569 Non-trainable params: 0 _________________________________________________________________
Our model here utilizes the accuracy and recall scores.
# Number of training epochs (despite the name, IT is passed to `epochs`).
IT = int(5e3)+1
# BUG FIX: the loss was 'mse', contradicting the cross-entropy loss described
# above; binary cross-entropy is the proper loss for a sigmoid binary classifier.
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.Recall()])
# Train the model, scoring the held-out test set after every epoch.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs= IT, batch_size=128, verbose = 0)
# Map Keras history keys to display names used in the tables/plots below.
# NOTE(review): Keras may suffix metric names (e.g. 'recall_1') when a model is
# recompiled in the same session — confirm the key actually present in history.
Metrics_Names = {'loss':'Loss', 'accuracy':'Accuracy', 'mae':'MAE', 'mse':'MSE', 'recall': 'Recall'}
def Table_modify(df, Metrics_Names = Metrics_Names):
    """Rename history columns to display names, sort them alphabetically, and
    prepend a 0-based 'Iteration' counter column."""
    renamed = df.rename(columns = Metrics_Names)
    renamed = renamed.reindex(columns = sorted(renamed.columns))
    renamed.insert(0, 'Iteration', np.arange(len(renamed)))
    return renamed
# Split the Keras history into validation ('val_'-prefixed) and training metric keys.
Validation_Table = Search_List('val_',history.history.keys())
# set() ordering is arbitrary, but Table_modify sorts the columns below, so the
# final column order is deterministic.
Train_Table = list(set( history.history.keys()) - set(Validation_Table))
# Turn the per-epoch metric lists into DataFrames (one column per metric).
Validation_Table = pd.DataFrame(np.array([history.history[x] for x in Validation_Table]).T, columns = Validation_Table)
Train_Table = pd.DataFrame(np.array([history.history[x] for x in Train_Table]).T, columns = Train_Table)
# Strip the 'val_' prefix so both tables share the same column names.
Validation_Table.columns = [x.replace('val_','') for x in Validation_Table.columns]
Train_Table = Table_modify(Train_Table)
Validation_Table = Table_modify(Validation_Table)
# BUG FIX: the original evaluated X_test under the 'Train Set Score' label and
# X_train under 'Validation Set Score' — the labels were swapped (the pasted
# output, where 'validation' beats 'train', corroborates this).
# Train Set Score
score = model.evaluate(X_train, y_train, batch_size=128, verbose = 0)
score = pd.DataFrame(score, index = model.metrics_names).T
score.index = ['Train Set Score']
# Validation (held-out test) Set Score
Temp = model.evaluate(X_test, y_test, batch_size=128, verbose = 0)
Temp = pd.DataFrame(Temp, index = model.metrics_names).T
Temp.index = ['Validation Set Score']
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score = pd.concat([score, Temp])
score.rename(columns= Metrics_Names, inplace = True)
score = score.reindex(sorted(score.columns), axis=1)
display(score.style.set_precision(4))
| Accuracy | Loss | Recall | |
|---|---|---|---|
| Train Set Score | 0.9150 | 0.0575 | 0.5661 |
| Validation Set Score | 0.9214 | 0.0540 | 0.6062 |
def Plot_history(history, Title = False, Table_Rows = 25):
    """Plot the Loss/Accuracy/Recall curves over iterations (left) next to a
    table sampling Table_Rows evenly spaced epochs plus the final one (right).

    history    : DataFrame with columns ['Iteration', 'Accuracy', 'Loss', 'Recall']
                 as produced by Table_modify.
    Title      : optional figure title string; False disables it.
    Table_Rows : number of evenly spaced rows shown in the table.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "scatter"},{"type": "table"}]])
    # Left: one line trace per metric.
    fig.add_trace(go.Scatter(x= history['Iteration'].values, y= history['Loss'].values,
                             line=dict(color='OrangeRed', width= 1.5), name = 'Loss'), 1, 1)
    fig.add_trace(go.Scatter(x= history['Iteration'].values, y= history['Accuracy'].values,
                             line=dict(color='MidnightBlue', width= 1.5), name = 'Accuracy'), 1, 1)
    fig.add_trace(go.Scatter(x= history['Iteration'].values, y= history['Recall'].values,
                             line=dict(color='purple', width= 1.5), name = 'Recall'), 1, 1)
    fig.update_layout(legend=dict(x=0, y=1.1, traceorder='reversed', font_size=12),
                      dragmode='select', plot_bgcolor= 'white', height=600, hovermode='closest',
                      legend_orientation='h')
    fig.update_xaxes(range=[history.Iteration.min(), history.Iteration.max()],
                     showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    # NOTE(review): y-axis is clamped to [0, 1]; loss values above 1 (possible
    # early in training) would be clipped out of view — confirm this is intended.
    fig.update_yaxes(range=[0, 1], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    # Right: pick Table_Rows evenly spaced row positions, then append the last index.
    ind = np.linspace(0, history.shape[0], Table_Rows, endpoint = False).round(0).astype(int)
    ind = np.append(ind, history.index[-1])
    history = history[history.index.isin(ind)]
    T = history.copy()
    # Scientific notation keeps the table columns uniformly wide.
    T[['Accuracy','Loss','Recall']] = T[['Accuracy','Loss','Recall']].applymap(lambda x: '%.4e' % x)
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(history.columns), line_color='darkslategray',
                                       fill_color='DimGray', align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = [0.4, 0.4, 0.4, 0.4],
                           cells=dict(values=Temp, line_color='darkslategray', fill=dict(color=['WhiteSmoke', 'white']),
                                      align=['center', 'center'], font_size=12,height=20)), 1, 2)
    if Title != False:
        fig.update_layout(plot_bgcolor= 'white',
                          title={'text': Title, 'x':0.46, 'y':0.94, 'xanchor': 'center', 'yanchor': 'top'},
                          yaxis_title='Frequency')
    fig.show()

# Plot the training and validation histories.
Plot_history(Train_Table, Title = 'Train Set')
Plot_history(Validation_Table, Title = 'Validation Set')
The confusion matrix allows for visualization of the performance of an algorithm.
def Confusion_Matrix(X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test):
    """Plot raw and row-normalized confusion matrices for the train and test sets.

    Uses the module-level `model`; predicted probabilities are rounded to 0/1.
    Returns (CM_Train, CM_Test) as produced by sklearn.metrics.confusion_matrix
    (rows = true labels, columns = predicted labels).
    """
    # Train
    CM_Train = metrics.confusion_matrix(y_train, np.round(model.predict(X_train)))
    # Test
    CM_Test = metrics.confusion_matrix(y_test, np.round(model.predict(X_test)))
    # Bold font for the figure suptitles
    font = FontProperties()
    font.set_weight('bold')
    Titles = ['Train Set', 'Test Set']
    CM = [CM_Train, CM_Test]
    for i in range(2):
        fig, ax = plt.subplots(1, 2, figsize=(12, 4))
        fig.suptitle(Titles[i], fontproperties=font, fontsize = 16)
        # Left panel: raw counts
        _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": 1})
        _ = ax[0].set_title('Confusion Matrix');
        # Right panel: counts normalized per true-label row (each row sums to 1)
        _ = sns.heatmap(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis],
                        annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
                        linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
        _ = ax[1].set_title('Normalized Confusion Matrix');
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels');
            _ = a.xaxis.set_ticklabels(Labels)
            _ = a.yaxis.set_ticklabels(Labels)
            _ = a.set_aspect(1)
    return CM_Train, CM_Test

# Build and display both confusion matrices; keep the arrays for the metrics below.
CM_Train, CM_Test = Confusion_Matrix()
Some of the metrics that we use here to measure performance: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}, \end{align} following scikit-learn's convention (rows are true labels, columns are predicted labels), which matches the `tn, fp, fn, tp = CM.ravel()` unpacking used below.
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [6] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
def _CM_Scores(CM, Label):
    """Print precision, recall, TPR, TNR and balanced accuracy derived from a
    2x2 confusion matrix in sklearn layout ([[tn, fp], [fn, tp]]).

    Label tags each printed line (e.g. 'Train' or 'Test').
    """
    tn, fp, fn, tp = CM.ravel()
    Precision = tp/(tp + fp)
    Recall = tp/(tp + fn)    # identical to the true-positive rate (TPR)
    TNR = tn/(tn + fp)       # true-negative rate
    BA = (Recall + TNR)/2    # balanced accuracy: robust to class imbalance
    print('Precision (%s) = %.2f' % (Label, Precision))
    print('Recall (%s) = %.2f' % (Label, Recall))
    print('TPR (%s) = %.2f' % (Label, Recall))
    print('TNR (%s) = %.2f' % (Label, TNR))
    print('Balanced Accuracy (%s) = %.2f' % (Label, BA))

# The train and test report bodies were duplicated line-for-line; factored into
# _CM_Scores. The dead PPCR computation (never printed or reused) was dropped.
Header('Train Set')
_CM_Scores(CM_Train, 'Train')
Header('Test Set', C = 'Green')
_CM_Scores(CM_Test, 'Test')
Line()
Train Set ========================================================================================== Precision (Train) = 0.67 Recall (Train) = 0.61 TPR (Train) = 0.61 TNR (Train) = 0.96 Balanced Accuracy (Train) = 0.78 Test Set =========================================================================================== Precision (Test) = 0.64 Recall (Test) = 0.57 TPR (Test) = 0.57 TNR (Test) = 0.96 Balanced Accuracy (Test) = 0.76 ====================================================================================================